Importing Libraries
# Visualization and Analysis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggdendro)
library(readxl)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
suppressWarnings({
library(readxl)
})
# Modeling and Inference
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
library(nnet)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-7
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the mine dataset: V, H, S are normalized sensor features in [0, 1],
# M is the mine class label (integer 1-5)
df <- read_excel("mine_data.xlsx")
# Quick distributional check of every column
summary(df)
## V H S M
## Min. :0.1977 Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:0.3097 1st Qu.:0.2727 1st Qu.:0.2000 1st Qu.:2.000
## Median :0.3595 Median :0.5455 Median :0.6000 Median :3.000
## Mean :0.4306 Mean :0.5089 Mean :0.5036 Mean :2.953
## 3rd Qu.:0.4826 3rd Qu.:0.7273 3rd Qu.:0.8000 3rd Qu.:4.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :5.000
# Exploratory scatter plots of the class label M against each sensor feature

# M vs. V
ggplot(df, aes(x = V, y = M)) +
  geom_point(color = "blue") +
  theme_bw()

# M vs. H
ggplot(df, aes(x = H, y = M)) +
  geom_point(color = "red") +
  theme_bw()

# M vs. S
ggplot(df, aes(x = S, y = M)) +
  geom_point(color = "blue") +
  theme_bw()

K-Means Clustering
# Cluster on the three sensor features only; M is held out as the reference
# label for evaluation. (The former `labels <- df[,c("M")]` was never used
# and shadowed base::labels, so it has been removed.)
features <- df[, c("V", "H", "S")]
# K-means with k = 5 (one cluster per mine class); nstart = 20 random
# restarts guards against poor initial centroids
kmeans_model <- kmeans(features, centers = 5, nstart = 20)
df$clusters <- kmeans_model$cluster
# Cross-tabulate cluster assignments (rows) against the true class M
# (columns). NOTE: cluster IDs are arbitrary, so even a perfect clustering
# need not produce a diagonal table.
confusion_matrix <- table(df$clusters, df$M)
print(confusion_matrix)
##
## 1 2 3 4 5
## 1 0 30 1 0 0
## 2 17 15 18 16 18
## 3 18 13 16 15 13
## 4 18 0 14 17 17
## 5 18 12 17 18 17
# 3-D scatter of the feature space colored by the TRUE class M
plot_ly(x=df$V, y=df$H, z=df$S, color=df$M, type="scatter3d")
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
# Same feature space colored by the k-means cluster assignment, for visual
# comparison with the true-class plot above
plot_ly(x=df$V, y=df$H, z=df$S, color=df$clusters, type="scatter3d")
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Analysis of Number of Clusters
# Elbow analysis: total within-cluster sum of squares (WCSS) as a function
# of the number of clusters k
k_values <- 1:8 # Adjust the range as needed
wss_values <- vector("numeric", length(k_values))
for (k in k_values) {
  # Fixes vs. the previous version: use the named `features` columns rather
  # than positional df[, 1:3] (fragile once columns are appended to df); use
  # a loop-local variable instead of clobbering the earlier `kmeans_model`;
  # add nstart restarts so the elbow curve is not an artifact of one bad
  # random initialization.
  fit_k <- kmeans(features, centers = k, nstart = 20)
  # tot.withinss is the precomputed sum(withinss)
  wss_values[k] <- fit_k$tot.withinss
}
elbow_plot <- ggplot(data.frame(k = k_values, wss = wss_values), aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  labs(title = "Elbow Plot for K-means Clustering",
       x = "Number of Clusters (k)",
       y = "Within-Cluster Sum of Squares (WCSS)")
ggplotly(elbow_plot)
# Re-fit k-means with the elbow-suggested k = 4 and visualize the result
new_kmeans_model <- kmeans(features, centers = 4, nstart = 20)
df$clusters2 <- new_kmeans_model$cluster
# Feature space colored by the 4-cluster assignment
plot_ly(x = df$V, y = df$H, z = df$S, color = df$clusters2, type = "scatter3d")
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Hierarchical Clustering
# Hierarchical clustering under three linkage rules.
# Compute the pairwise Euclidean distance matrix ONCE — it was previously
# recomputed for every linkage method, tripling the O(n^2) distance work
# for identical results.
feature_dist <- dist(features)
h_clust.complete <- hclust(feature_dist, method = "complete")
h_clust.single <- hclust(feature_dist, method = "single")
h_clust.average <- hclust(feature_dist, method = "average")
# Interactive dendrogram for each linkage
ggplotly(ggdendrogram(h_clust.complete, rotate = FALSE, size = 2))
ggplotly(ggdendrogram(h_clust.single, rotate = FALSE, size = 2))
ggplotly(ggdendrogram(h_clust.average, rotate = FALSE, size = 2))
Classification Performances
# Reproducible ~80/20 train/test split via a Bernoulli mask per row
set.seed(42)
sample1 <- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.8,0.2))
train1 <- df[sample1,]
test1 <- df[!sample1,]
# The target must be a factor for multinom() and svm() to treat this as
# classification rather than regression
train1$M <- factor(train1$M)
test1$M <- factor(test1$M)
Logistic Regression
# Fit a multinomial logistic regression of mine class M on the three sensor
# features (nnet::multinom); class 1 is the implicit baseline category
lr_model <- multinom(M ~ V + H + S, data=train1)
## # weights: 25 (16 variable)
## initial value 436.157674
## iter 10 value 337.522543
## iter 20 value 280.974520
## iter 30 value 280.278933
## iter 40 value 280.272250
## iter 50 value 280.271867
## final value 280.271806
## converged
# Print the summary of the model: one row of coefficients / std. errors per
# non-baseline class (2-5), each relative to class 1
print(summary(lr_model))
## Call:
## multinom(formula = M ~ V + H + S, data = train1)
##
## Coefficients:
## (Intercept) V H S
## 2 -33.320353 64.78419 14.074729 -3.43616821
## 3 -11.744900 28.89174 4.209780 -0.77318354
## 4 -4.939808 14.50535 1.366517 -0.53371800
## 5 -9.471749 23.92129 2.916064 -0.03414481
##
## Std. Errors:
## (Intercept) V H S
## 2 4.882370 8.977899 2.4262340 1.4245505
## 3 1.875182 4.471560 0.9939078 0.6889898
## 4 1.395354 3.751771 0.7654476 0.6029561
## 5 1.675540 4.167169 0.8911530 0.6572570
##
## Residual Deviance: 560.5436
## AIC: 592.5436
# Predict on the test set
lr_pred <- predict(lr_model, test1)
# Model Diagnostics: rows = true class, columns = predicted class
confusion_matrix_1 <- table(test1$M, lr_pred)
# BUG FIX: accuracy was mean(diag(cm)) — the average of the diagonal COUNTS
# (which printed 6.2, an impossible "accuracy") — not a rate.
# Accuracy = correct predictions / total predictions.
accuracy <- sum(diag(confusion_matrix_1)) / sum(confusion_matrix_1)
# BUG FIX: caret's precision()/recall() expect a two-class table and
# returned NA on this 5-class problem. Compute per-class values directly:
# precision_k = correct_k / predicted-as-k, recall_k = correct_k / truly-k.
precision <- diag(confusion_matrix_1) / colSums(confusion_matrix_1)
recall <- diag(confusion_matrix_1) / rowSums(confusion_matrix_1)
print(accuracy)
print(precision)
print(recall)
print(confusion_matrix_1)
## lr_pred
## 1 2 3 4 5
## 1 15 0 1 4 1
## 2 0 9 1 0 0
## 3 0 1 4 3 6
## 4 3 0 1 0 6
## 5 1 0 6 2 3
# Confusion matrix as a heatmap
plot_ly(
  x = c(1,2,3,4,5), y = c(1,2,3,4,5),
  z = confusion_matrix_1, type = "heatmap", colorscale = 'Greys'
)
Linear Support Vector Machine
# Fit a support vector machine with a LINEAR kernel (e1071::svm,
# C-classification, default cost = 1)
svm_model <- svm(M ~ V + H + S, data=train1, kernel="linear")
# Print the summary of the model
print(summary(svm_model))
##
## Call:
## svm(formula = M ~ V + H + S, data = train1, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 223
##
## ( 44 18 52 56 53 )
##
##
## Number of Classes: 5
##
## Levels:
## 1 2 3 4 5
# Predict on the test set
svm_pred <- predict(svm_model, test1)
# Model Diagnostics: rows = true class, columns = predicted class
confusion_matrix_2 <- table(test1$M, svm_pred)
# BUG FIX: accuracy was mean(diag(cm)) — the average of the diagonal COUNTS
# (which printed 6.4, an impossible "accuracy") — not a rate.
# Accuracy = correct predictions / total predictions.
accuracy <- sum(diag(confusion_matrix_2)) / sum(confusion_matrix_2)
# BUG FIX: caret's precision()/recall() expect a two-class table and
# returned NA on this 5-class problem. Compute per-class values directly:
# precision_k = correct_k / predicted-as-k, recall_k = correct_k / truly-k.
precision <- diag(confusion_matrix_2) / colSums(confusion_matrix_2)
recall <- diag(confusion_matrix_2) / rowSums(confusion_matrix_2)
print(accuracy)
print(precision)
print(recall)
print(confusion_matrix_2)
## svm_pred
## 1 2 3 4 5
## 1 15 0 2 2 2
## 2 0 9 1 0 0
## 3 0 1 4 3 6
## 4 3 0 2 0 5
## 5 1 0 6 1 4
# Confusion matrix as a heatmap
plot_ly(
  x = c(1,2,3,4,5), y = c(1,2,3,4,5),
  z = confusion_matrix_2, type = "heatmap", colorscale = 'Greys'
)
Radial Support Vector Machine
# Fit a support vector machine with a RADIAL (RBF) kernel for comparison
# with the linear kernel above (e1071::svm, C-classification, cost = 1)
svm_model <- svm(M ~ V + H + S, data=train1, kernel="radial")
# Print the summary of the model
print(summary(svm_model))
##
## Call:
## svm(formula = M ~ V + H + S, data = train1, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 239
##
## ( 47 31 52 56 53 )
##
##
## Number of Classes: 5
##
## Levels:
## 1 2 3 4 5
# Predict on the test set
svm_pred <- predict(svm_model, test1)
# Model Diagnostics: rows = true class, columns = predicted class
confusion_matrix_3 <- table(test1$M, svm_pred)
# BUG FIX: accuracy was mean(diag(cm)) — the average of the diagonal COUNTS
# (which printed 4.8, an impossible "accuracy") — not a rate.
# Accuracy = correct predictions / total predictions.
accuracy <- sum(diag(confusion_matrix_3)) / sum(confusion_matrix_3)
# BUG FIX: caret's precision()/recall() expect a two-class table and
# returned NA on this 5-class problem. Compute per-class values directly:
# precision_k = correct_k / predicted-as-k, recall_k = correct_k / truly-k.
precision <- diag(confusion_matrix_3) / colSums(confusion_matrix_3)
recall <- diag(confusion_matrix_3) / rowSums(confusion_matrix_3)
print(accuracy)
print(precision)
print(recall)
print(confusion_matrix_3)
## svm_pred
## 1 2 3 4 5
## 1 6 0 3 8 4
## 2 0 8 1 1 0
## 3 0 1 4 2 7
## 4 2 0 0 2 6
## 5 1 0 1 6 4
# Confusion matrix as a heatmap
plot_ly(
  x = c(1,2,3,4,5), y = c(1,2,3,4,5),
  z = confusion_matrix_3, type = "heatmap", colorscale = 'Greys'
)